{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dimensionality Reduction for High-Dimensional Time Series"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this example, we reduce the dimensionality of a time series input from\n",
    "roughly 1,000 features to 100."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import some libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "qvMXmSSHoHW_"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from scipy.stats import expon, randint\n",
    "from sklearn.decomposition import KernelPCA, PCA\n",
    "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
    "from sklearn.exceptions import ConvergenceWarning\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# joblib is no longer vendored under sklearn.externals in current scikit-learn\n",
    "# releases: prefer the standalone package, fall back for legacy environments.\n",
    "try:\n",
    "    import joblib\n",
    "except ImportError:\n",
    "    from sklearn.externals import joblib\n",
    "\n",
    "# ignore_warnings moved from sklearn.utils.testing to sklearn.utils._testing;\n",
    "# try the current location first so the cell runs on old and new releases.\n",
    "try:\n",
    "    from sklearn.utils._testing import ignore_warnings\n",
    "except ImportError:\n",
    "    from sklearn.utils.testing import ignore_warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import your tools for cleaning, encoding if needed, and data scaling\n",
    "\n",
    "import data_cleaning\n",
    "import data_encoding\n",
    "import data_scaling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the raw data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(79073, 1106)\n"
     ]
    }
   ],
   "source": [
    "# Load raw data \n",
    "X_data_all = pd.read_csv('high_dim_data.csv', index_col = 'timestamp')\n",
    "print(X_data_all.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>None_avg</th>\n",
       "      <th>None_max</th>\n",
       "      <th>None_median</th>\n",
       "      <th>None_min</th>\n",
       "      <th>None_p95</th>\n",
       "      <th>None_var</th>\n",
       "      <th>anti_entropy_saturation_avg</th>\n",
       "      <th>anti_entropy_saturation_max</th>\n",
       "      <th>anti_entropy_saturation_median</th>\n",
       "      <th>batch_remove_batch_remove_dropped_avg</th>\n",
       "      <th>...</th>\n",
       "      <th>write_view_ratio_avg</th>\n",
       "      <th>write_view_ratio_avg_pit</th>\n",
       "      <th>write_view_ratio_max</th>\n",
       "      <th>write_view_ratio_max_pit</th>\n",
       "      <th>write_view_ratio_median</th>\n",
       "      <th>write_view_ratio_median_pit</th>\n",
       "      <th>write_view_ratio_sum</th>\n",
       "      <th>write_view_ratio_sum_pit</th>\n",
       "      <th>label</th>\n",
       "      <th>issue_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>timestamp</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2019-02-01 00:01:58.699</th>\n",
       "      <td>1.333333</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>CASSANDRA_LEVELED_COMPACTION_MISUSE</td>\n",
       "      <td>4018b55b-3996-11e9-943b-42010a8a0058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2019-02-01 00:02:20.877</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2019-02-01 00:02:23.539</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2019-02-01 00:02:44.801</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>CASSANDRA_TOMBSTONE_ACCUMULATION</td>\n",
       "      <td>e4fa365b-3995-11e9-8a7e-42010a8a004f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2019-02-01 00:02:44.973</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>CASSANDRA_LWT_MISUSE</td>\n",
       "      <td>899c0c76-3996-11e9-9335-42010a8a0015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1106 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         None_avg  None_max  None_median  None_min  None_p95  \\\n",
       "timestamp                                                                      \n",
       "2019-02-01 00:01:58.699  1.333333       2.0          1.0       1.0       1.9   \n",
       "2019-02-01 00:02:20.877  0.000000       0.0          0.0       0.0       0.0   \n",
       "2019-02-01 00:02:23.539  0.000000       0.0          0.0       0.0       0.0   \n",
       "2019-02-01 00:02:44.801  0.000000       0.0          0.0       0.0       0.0   \n",
       "2019-02-01 00:02:44.973  0.000000       0.0          0.0       0.0       0.0   \n",
       "\n",
       "                         None_var  anti_entropy_saturation_avg  \\\n",
       "timestamp                                                        \n",
       "2019-02-01 00:01:58.699  0.353553                          0.0   \n",
       "2019-02-01 00:02:20.877       NaN                          0.0   \n",
       "2019-02-01 00:02:23.539       NaN                          0.0   \n",
       "2019-02-01 00:02:44.801       NaN                          0.0   \n",
       "2019-02-01 00:02:44.973       NaN                          0.0   \n",
       "\n",
       "                         anti_entropy_saturation_max  \\\n",
       "timestamp                                              \n",
       "2019-02-01 00:01:58.699                          0.0   \n",
       "2019-02-01 00:02:20.877                          0.0   \n",
       "2019-02-01 00:02:23.539                          0.0   \n",
       "2019-02-01 00:02:44.801                          0.0   \n",
       "2019-02-01 00:02:44.973                          0.0   \n",
       "\n",
       "                         anti_entropy_saturation_median  \\\n",
       "timestamp                                                 \n",
       "2019-02-01 00:01:58.699                             0.0   \n",
       "2019-02-01 00:02:20.877                             0.0   \n",
       "2019-02-01 00:02:23.539                             0.0   \n",
       "2019-02-01 00:02:44.801                             0.0   \n",
       "2019-02-01 00:02:44.973                             0.0   \n",
       "\n",
       "                         batch_remove_batch_remove_dropped_avg  ...  \\\n",
       "timestamp                                                       ...   \n",
       "2019-02-01 00:01:58.699                                    0.0  ...   \n",
       "2019-02-01 00:02:20.877                                    0.0  ...   \n",
       "2019-02-01 00:02:23.539                                    0.0  ...   \n",
       "2019-02-01 00:02:44.801                                    0.0  ...   \n",
       "2019-02-01 00:02:44.973                                    0.0  ...   \n",
       "\n",
       "                         write_view_ratio_avg  write_view_ratio_avg_pit  \\\n",
       "timestamp                                                                 \n",
       "2019-02-01 00:01:58.699                   0.0                       0.0   \n",
       "2019-02-01 00:02:20.877                   0.0                       0.0   \n",
       "2019-02-01 00:02:23.539                   0.0                       0.0   \n",
       "2019-02-01 00:02:44.801                   0.0                       0.0   \n",
       "2019-02-01 00:02:44.973                   0.0                       0.0   \n",
       "\n",
       "                         write_view_ratio_max  write_view_ratio_max_pit  \\\n",
       "timestamp                                                                 \n",
       "2019-02-01 00:01:58.699                   0.0                       0.0   \n",
       "2019-02-01 00:02:20.877                   0.0                       0.0   \n",
       "2019-02-01 00:02:23.539                   0.0                       0.0   \n",
       "2019-02-01 00:02:44.801                   0.0                       0.0   \n",
       "2019-02-01 00:02:44.973                   0.0                       0.0   \n",
       "\n",
       "                         write_view_ratio_median  write_view_ratio_median_pit  \\\n",
       "timestamp                                                                       \n",
       "2019-02-01 00:01:58.699                      0.0                          0.0   \n",
       "2019-02-01 00:02:20.877                      0.0                          0.0   \n",
       "2019-02-01 00:02:23.539                      0.0                          0.0   \n",
       "2019-02-01 00:02:44.801                      0.0                          0.0   \n",
       "2019-02-01 00:02:44.973                      0.0                          0.0   \n",
       "\n",
       "                         write_view_ratio_sum  write_view_ratio_sum_pit  \\\n",
       "timestamp                                                                 \n",
       "2019-02-01 00:01:58.699                   0.0                       0.0   \n",
       "2019-02-01 00:02:20.877                   0.0                       0.0   \n",
       "2019-02-01 00:02:23.539                   0.0                       0.0   \n",
       "2019-02-01 00:02:44.801                   0.0                       0.0   \n",
       "2019-02-01 00:02:44.973                   0.0                       0.0   \n",
       "\n",
       "                                                       label  \\\n",
       "timestamp                                                      \n",
       "2019-02-01 00:01:58.699  CASSANDRA_LEVELED_COMPACTION_MISUSE   \n",
       "2019-02-01 00:02:20.877                                  NaN   \n",
       "2019-02-01 00:02:23.539                                  NaN   \n",
       "2019-02-01 00:02:44.801     CASSANDRA_TOMBSTONE_ACCUMULATION   \n",
       "2019-02-01 00:02:44.973                 CASSANDRA_LWT_MISUSE   \n",
       "\n",
       "                                                     issue_id  \n",
       "timestamp                                                      \n",
       "2019-02-01 00:01:58.699  4018b55b-3996-11e9-943b-42010a8a0058  \n",
       "2019-02-01 00:02:20.877                                   NaN  \n",
       "2019-02-01 00:02:23.539                                   NaN  \n",
       "2019-02-01 00:02:44.801  e4fa365b-3995-11e9-8a7e-42010a8a004f  \n",
       "2019-02-01 00:02:44.973  899c0c76-3996-11e9-9335-42010a8a0015  \n",
       "\n",
       "[5 rows x 1106 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_data_all.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Do the required cleaning for the dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(79073, 569)\n"
     ]
    }
   ],
   "source": [
    "# The data_cleaning module drops columns that contain only NaN or zero values.\n",
    "# More cleaning checks can be added as needed.\n",
    "\n",
    "X_data_clean = data_cleaning.cleaner('high_dim_data.csv')\n",
    "\n",
    "print(X_data_clean.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>None_avg</th>\n",
       "      <th>None_max</th>\n",
       "      <th>None_median</th>\n",
       "      <th>None_min</th>\n",
       "      <th>None_p95</th>\n",
       "      <th>None_var</th>\n",
       "      <th>batch_remove_batch_remove_dropped_var</th>\n",
       "      <th>batch_store_batch_store_dropped_var</th>\n",
       "      <th>blocked_anti_entropy_var</th>\n",
       "      <th>...</th>\n",
       "      <th>write_two_timeouts_count_var</th>\n",
       "      <th>write_two_unavailables_count_var</th>\n",
       "      <th>write_unavailables_count_avg</th>\n",
       "      <th>write_unavailables_count_max</th>\n",
       "      <th>write_unavailables_count_median</th>\n",
       "      <th>write_unavailables_count_min</th>\n",
       "      <th>write_unavailables_count_p95</th>\n",
       "      <th>write_unavailables_count_var</th>\n",
       "      <th>label</th>\n",
       "      <th>issue_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019-02-01 00:01:58.699</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CASSANDRA_LEVELED_COMPACTION_MISUSE</td>\n",
       "      <td>4018b55b-3996-11e9-943b-42010a8a0058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-02-01 00:02:20.877</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.333333</td>\n",
       "      <td>253.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>227.7</td>\n",
       "      <td>1.414214</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2019-02-01 00:02:23.539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-02-01 00:02:44.801</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CASSANDRA_TOMBSTONE_ACCUMULATION</td>\n",
       "      <td>e4fa365b-3995-11e9-8a7e-42010a8a004f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2019-02-01 00:02:44.973</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CASSANDRA_LWT_MISUSE</td>\n",
       "      <td>899c0c76-3996-11e9-9335-42010a8a0015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 569 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 timestamp  None_avg  None_max  None_median  None_min  \\\n",
       "0  2019-02-01 00:01:58.699  1.333333       2.0          1.0       1.0   \n",
       "1  2019-02-01 00:02:20.877  0.000000       0.0          0.0       0.0   \n",
       "2  2019-02-01 00:02:23.539  0.000000       0.0          0.0       0.0   \n",
       "3  2019-02-01 00:02:44.801  0.000000       0.0          0.0       0.0   \n",
       "4  2019-02-01 00:02:44.973  0.000000       0.0          0.0       0.0   \n",
       "\n",
       "   None_p95  None_var  batch_remove_batch_remove_dropped_var  \\\n",
       "0       1.9  0.353553                                    NaN   \n",
       "1       0.0       NaN                                    NaN   \n",
       "2       0.0       NaN                                    NaN   \n",
       "3       0.0       NaN                                    NaN   \n",
       "4       0.0       NaN                                    NaN   \n",
       "\n",
       "   batch_store_batch_store_dropped_var  blocked_anti_entropy_var  ...  \\\n",
       "0                                  NaN                       NaN  ...   \n",
       "1                                  NaN                       NaN  ...   \n",
       "2                                  NaN                       NaN  ...   \n",
       "3                                  NaN                       NaN  ...   \n",
       "4                                  NaN                       NaN  ...   \n",
       "\n",
       "   write_two_timeouts_count_var  write_two_unavailables_count_var  \\\n",
       "0                           NaN                               NaN   \n",
       "1                           NaN                               NaN   \n",
       "2                           NaN                               NaN   \n",
       "3                           NaN                               NaN   \n",
       "4                           NaN                               NaN   \n",
       "\n",
       "   write_unavailables_count_avg  write_unavailables_count_max  \\\n",
       "0                      0.000000                           0.0   \n",
       "1                     84.333333                         253.0   \n",
       "2                      0.000000                           0.0   \n",
       "3                      0.000000                           0.0   \n",
       "4                      0.000000                           0.0   \n",
       "\n",
       "   write_unavailables_count_median  write_unavailables_count_min  \\\n",
       "0                              0.0                           0.0   \n",
       "1                              0.0                           0.0   \n",
       "2                              0.0                           0.0   \n",
       "3                              0.0                           0.0   \n",
       "4                              0.0                           0.0   \n",
       "\n",
       "   write_unavailables_count_p95  write_unavailables_count_var  \\\n",
       "0                           0.0                           NaN   \n",
       "1                         227.7                      1.414214   \n",
       "2                           0.0                           NaN   \n",
       "3                           0.0                           NaN   \n",
       "4                           0.0                           NaN   \n",
       "\n",
       "                                 label                              issue_id  \n",
       "0  CASSANDRA_LEVELED_COMPACTION_MISUSE  4018b55b-3996-11e9-943b-42010a8a0058  \n",
       "1                                  NaN                                   NaN  \n",
       "2                                  NaN                                   NaN  \n",
       "3     CASSANDRA_TOMBSTONE_ACCUMULATION  e4fa365b-3995-11e9-8a7e-42010a8a004f  \n",
       "4                 CASSANDRA_LWT_MISUSE  899c0c76-3996-11e9-9335-42010a8a0015  \n",
       "\n",
       "[5 rows x 569 columns]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_data_clean.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode categorical features and labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "4SeIz8DueUiT"
   },
   "outputs": [],
   "source": [
    "# After cleaning, encode any categorical features and the labels\n",
    "X_data_enc, y_data_enc = data_encoding.data_enc(X_data_clean)\n",
    "X_data_enc = X_data_enc.iloc[:,5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Ew1hAObuf_3o"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>None_avg</th>\n",
       "      <th>None_max</th>\n",
       "      <th>None_median</th>\n",
       "      <th>None_min</th>\n",
       "      <th>None_p95</th>\n",
       "      <th>None_var</th>\n",
       "      <th>batch_remove_batch_remove_dropped_var</th>\n",
       "      <th>batch_store_batch_store_dropped_var</th>\n",
       "      <th>blocked_anti_entropy_var</th>\n",
       "      <th>blocked_compactions_var</th>\n",
       "      <th>...</th>\n",
       "      <th>write_timeouts_count_var</th>\n",
       "      <th>write_two_failures_count_var</th>\n",
       "      <th>write_two_timeouts_count_var</th>\n",
       "      <th>write_two_unavailables_count_var</th>\n",
       "      <th>write_unavailables_count_avg</th>\n",
       "      <th>write_unavailables_count_max</th>\n",
       "      <th>write_unavailables_count_median</th>\n",
       "      <th>write_unavailables_count_min</th>\n",
       "      <th>write_unavailables_count_p95</th>\n",
       "      <th>write_unavailables_count_var</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.333333</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.131371</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>84.333333</td>\n",
       "      <td>253.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>227.7</td>\n",
       "      <td>1.414214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.131371</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.414214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.131371</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.414214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.353553</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.131371</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.414214</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 562 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   None_avg  None_max  None_median  None_min  None_p95  None_var  \\\n",
       "0  1.333333       2.0          1.0       1.0       1.9  0.353553   \n",
       "1  0.000000       0.0          0.0       0.0       0.0  0.353553   \n",
       "2  0.000000       0.0          0.0       0.0       0.0  0.353553   \n",
       "3  0.000000       0.0          0.0       0.0       0.0  0.353553   \n",
       "4  0.000000       0.0          0.0       0.0       0.0  0.353553   \n",
       "\n",
       "   batch_remove_batch_remove_dropped_var  batch_store_batch_store_dropped_var  \\\n",
       "0                                    0.0                                  0.0   \n",
       "1                                    0.0                                  0.0   \n",
       "2                                    0.0                                  0.0   \n",
       "3                                    0.0                                  0.0   \n",
       "4                                    0.0                                  0.0   \n",
       "\n",
       "   blocked_anti_entropy_var  blocked_compactions_var  ...  \\\n",
       "0                       0.0                      0.0  ...   \n",
       "1                       0.0                      0.0  ...   \n",
       "2                       0.0                      0.0  ...   \n",
       "3                       0.0                      0.0  ...   \n",
       "4                       0.0                      0.0  ...   \n",
       "\n",
       "   write_timeouts_count_var  write_two_failures_count_var  \\\n",
       "0                  0.000000                           0.0   \n",
       "1                  1.131371                           0.0   \n",
       "2                  1.131371                           0.0   \n",
       "3                  1.131371                           0.0   \n",
       "4                  1.131371                           0.0   \n",
       "\n",
       "   write_two_timeouts_count_var  write_two_unavailables_count_var  \\\n",
       "0                           0.0                               0.0   \n",
       "1                           0.0                               0.0   \n",
       "2                           0.0                               0.0   \n",
       "3                           0.0                               0.0   \n",
       "4                           0.0                               0.0   \n",
       "\n",
       "   write_unavailables_count_avg  write_unavailables_count_max  \\\n",
       "0                      0.000000                           0.0   \n",
       "1                     84.333333                         253.0   \n",
       "2                      0.000000                           0.0   \n",
       "3                      0.000000                           0.0   \n",
       "4                      0.000000                           0.0   \n",
       "\n",
       "   write_unavailables_count_median  write_unavailables_count_min  \\\n",
       "0                              0.0                           0.0   \n",
       "1                              0.0                           0.0   \n",
       "2                              0.0                           0.0   \n",
       "3                              0.0                           0.0   \n",
       "4                              0.0                           0.0   \n",
       "\n",
       "   write_unavailables_count_p95  write_unavailables_count_var  \n",
       "0                           0.0                      0.000000  \n",
       "1                         227.7                      1.414214  \n",
       "2                           0.0                      1.414214  \n",
       "3                           0.0                      1.414214  \n",
       "4                           0.0                      1.414214  \n",
       "\n",
       "[5 rows x 562 columns]"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Impute missing values. If there is a large number of NaNs in the features we\n",
    "# cannot do dimensionality reduction with them.\n",
    "#   1. Forward-fill down the rows, carrying the last observed value of each\n",
    "#      column forward.\n",
    "#   2. Fill whatever is still missing (e.g. NaNs that precede the first observed\n",
    "#      value of a column) with 0.\n",
    "# DataFrame.ffill() is used instead of the deprecated fillna(method='ffill'), and\n",
    "# the two steps are chained rather than mutating the frame with inplace=True.\n",
    "X_data_enc_clean = X_data_enc.ffill().fillna(value=0)\n",
    "X_data_enc_clean.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scale the data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "pGo0E2Yl9s6S"
   },
   "outputs": [],
   "source": [
    "# As with any data, we need to scale it. Here we use standardization.\n",
    "# scale_data returns four values; only the first (the scaled data) and the third\n",
    "# (kept as scaler_std) are retained, the other two are discarded.\n",
    "# NOTE(review): scaler_std is presumably the fitted scaler object -- confirm in data_scaling.\n",
    "X_data_std,_,scaler_std,_ = data_scaling.scale_data(X_data_enc_clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 244
    },
    "colab_type": "code",
    "id": "ivW3QRmIgSuy",
    "outputId": "8f125ee4-2ce5-4aa4-b6fd-e5c188f98042"
   },
   "outputs": [],
   "source": [
    "# now we're ready to do dimensionality reduction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "xK6p3qLEhww0"
   },
   "outputs": [],
   "source": [
    "# Load the dimensionality reduction module.\n",
    "# First, let's do some exploration with PCA.\n",
    "# Here we examine by hand which features make the most sense, and also test them out with a\n",
    "# classifier.\n",
    "import dim_red"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd5hdVb3/8fdnJpn0TkhIAiRUiYCU0CwURYqAYAUEEYWLiKj3IlfFggW9j1wvyvWKICqK0n5cVJpUacpFSgIhhBIIKaSSnklmJlO/vz/2HjiZTDkp55w5Z39ez3Oes9vZ57uGsL9nrbXX2ooIzMwsu6pKHYCZmZWWE4GZWcY5EZiZZZwTgZlZxjkRmJllnBOBmVnGORFY2ZP0e0k/zPPYeyV9pgAxTJQUkvps63N38X07SVovqboY32eVzYnAikbSPEkN6QWs/fWLYsYQEcdHxPXF/E5J90v6QSfbT5a0dEuSR0S8ERGDI6J120RpWeZEYMV2UnoBa39dWOqAiuD3wKclqcP2TwM3RkTL5pysWLUOyw4nAusVJF0t6bac9cslPaTEkZIWSvqmpBVpzeKMLs4zQtLdkpZLWp0uT8jZ/6ikc9PlsyU9Lum/0mPnSjo+59hhkn4raYmkRZJ+2N4UI6k6/dwKSXOAE7op3u3ASOB9uXECJwJ/SNdPkPScpFpJCyR9L+fY9mancyS9ATzcsSlK0mclvSxpnaQ5kj6f8/n2v99XJS1Ly/PZnP0DJF0hab6ktenfZEC671BJT0haI+l5SUd2U04rU04E1lt8Fdg3vTi/DzgH+Ey8PQfKWGA7YDzwGeBaSXt2cp4q4HfAzsBOQAPQXfPTIcCs9Nz/Cfw255f79UALsBuwP3AMcG66719ILuT7A1OAj3f1BRHRANwKnJWz+ZPAKxHxfLpel+4fTpJUviDplA6nOgLYCzi2k69ZlsYzFPgs8DNJB+TsHwsMI/n7nQNclSYjgP8CDgTeTZKwvga0SRoP/BX4Ybr9YuBPkkZ3VVYrUxHhl19FeQHzgPXAmpzXv+TsPxhYBcwHTs/ZfiTJBXlQzrZbge+ky78HftjFd+4HrM5ZfxQ4N10+G5ids28gECQXzTFAIzAgZ//pwCPp8sPA+Tn7jkk/26eLON4LrG0/H/B/wL9187e6EvhZujwxPfcuOfsn9vB9twNfyfn7NeQeS5I4DiVJnA3Auzo5x9eBP3bYdj9Jgi75vye/tt3LbY1WbKdExN862xERT6fNLNuTXOhzrY6Iupz1+cC4jueQNBD4GXAc0P6Ld4ik6ui8Y3VpzvfXp5WBwSS/gPsCS3Ka9quABenyuJzl9ni6FBGPS1oOnCzpaeAg4KM5cR8C/BjYG6gB+gH/2+E0C+hC2qT1XWCPNM6BwAs5h6yMjfsi6tNybgf0B17v5LQ7A5+QdFLOtr7AI12X1MqRm4as15D0RZIL4GKS5olcIyQNylnfKT2uo68CewKHRMRQ4PD2029mOAtIagTbRcTw9DU0It6Z7l8C7Nghnp78gaT559PAAxHxZs6+m4A7gR0jYhhwTScxdzpVsKR+wJ9ImnjGRMRw4J5OPt+ZFcAGYNdO9i0gqREMz3kNiogf53FeKyNOBNYrSNqDpC36TJIL5dck7dfhsO9Lqkn7EE5k01/MAENImjrWSBpJ8it5s0XEEuAB4ApJQyVVSdpV0hHpIbcCX5Y0IW1r/0Yep/0DcDRJ/0LHW1iHAKsiYoOkg4FPbUa47TWI5UBLWjs4Jp8PRkQbcB3wU0nj0k7ww9LkcgNwkqRj0+39047nCd2f1cqNE4EV213aeBzBX9I7X24ALo+I5yPiNeCbwB/TCxIkTTirSWoBN5K0z7/SyfmvBAaQ/NJ9ErhvK2I9i+Qi+1L63bcBO6T7fk3SXv488Czw555OFhHzgCeAQSS//nNdAPxA0jrgUjZtGuvuvOuAL6efWU2S
RDqevzsXkzQjPUPSR3M5UBURC4CTSf5bLCepIfw7vm5UHEX4wTTWu6W3LN4QEf4lalYAzuxmZhnnRGBmlnFuGjIzyzjXCMzMMq7sBpRtt912MXHixFKHYWZWVqZNm7YiIjqdHqTsEsHEiROZOnVqqcMwMysrkroc/e6mITOzjHMiMDPLOCcCM7OMcyIwM8s4JwIzs4wrWCKQdF36WLyZXeyXpJ9Lmi1pRoenKZmZWZEUskbwe5KHg3TleGD39HUecHUBYzEzsy4UbBxBRPxd0sRuDjkZ+EMkc1w8KWm4pB3SeeCtTEQEjS1tNDS1Ut/cSkNTC/VNrdQ1ttLY0kprW9DcGrS0tb213NrWlmxrbaOlLYiAoP2dt9ZJl9u/J3c/bPwZPFWKZcCUiSM5fI9t/8joUg4oG8/Gj95bmG7bJBFIOo+k1sBOO+XzICjbUo0trSxdu4HFazawZG0DS2s3sKa+mdV1TaxpaGZNfRNr6ptZ29BMfVMr9U0ttPWSa7A29xlkZmXm/CN2rbhE0Nn/tp1eUiLiWuBagClTpvSSy075amsL5q+q57U31/H68jpeX76e15evZ8Gqelasb9rk+H59qhgxsIbhA/syfGBfdtt+MMMG9GVgTR8G1lQzoKaagelrQE0fBvZNlvv1raZvtaiuEn2rq5L3qir6VIs+VaJPuq26KvmnIJKLudBGF/XcbckxHY53BjDbKqVMBAvZ+JmvE+j8GbS2ld6s3cAz81bxwsK1zFi4lpmL1rKu8e3nmI8Z2o9dRw/mg5PHsMOwAewwrD/jhifvY4f1Z2BN2c1EYmaboZT/h98JXCjpFuAQYK37B7aN1XVNPPH6Sv45ZwVPvL6SOcvrAKiprmKvcUM5Zf/x7DN+GHuOHcIuowcxpH/fEkdsZqVUsEQg6WbgSGA7SQtJHiLeFyAirgHuAT4EzAbqgc8WKpYsWLCqnvtfXMoDL73J1HmraAsYVFPNwZNGctpBO3LoLqN4x9ih1PTx0BEz21gh7xo6vYf9AXyxUN+fBavrmrj7hSXc/twips1fDcCeY4ZwwZG7cdQ7tmffCcPoW+0Lv5l1z42/ZSYiePaN1Vz3+DweeGkpza3BHmMG87Xj9uTEfcax06iBpQ7RzMqME0GZaGlt496ZS/nt43OZvmANwwb05azDJvLRA8YzeYehvnPGzLaYE0Ev19YW3DNzCVc88CpzV9QxcdRALjv5nXzswAm+m8fMtglfSXqxZ+at4vt3vcjMRbXsOWYI15x5IMdMHkNVlX/9m9m240TQC61Y38iP732F26YtZNyw/vz0k+/i5P3GvzXwysxsW3Ii6GX+OmMJ37r9BeoaW/jCkbvypffv5iYgMysoX2F6idoNzXz3jhf5y3OLeNeOw7niE/uy2/ZDSh2WmWWAE0Ev8Owbq/nSTc+xtHYD/3r07lx41G708f3/ZlYkTgQldtNTb/DdO2cydlh/bjv/MPbfaUSpQzKzjHEiKJHm1jYuvWMmNz+9gMP3GM3PT9uP4QNrSh2WmWWQE0EJ1DW2cMGNz/LYq8v54lG7ctEH9/QdQWZWMk4ERbamvomzrnuamYvW8uOP7sNpB/tBO2ZWWk4ERbRuQzOfue5pXlmyjms/PYWjJ48pdUhmZk4ExdLQ1Mo5v5/KzMW1XHPmgU4CZtZr+B7FImhrCy66dTrPzF/Fz07djw86CZhZL+JEUAQ/+9ur3DtzKd88fi8+/K5xpQ7HzGwjTgQFdtfzi/mfh2dz6pQdOfd9k0odjpnZJpwICmj+yjou+fMLHLjzCC47ZW8/M8DMeiUnggJpamnjSzc/R3WV+Pnp+/tZwWbWa/muoQK54sFZzFi4lmvOPJDxwweUOhwzsy75Z2oBPL9gDb/++xxOO2hHjtt7bKnDMTPrVo81AklTgPcB44AGYCbwt4hYVeDYylJTSxtfu20Go4f045sn7FXqcMzMetRljUDS2ZKeBS4BBgCzgGXAe4EHJV0vyfMjdHD1o68z
6811/OiUfRjav2+pwzEz61F3NYJBwHsioqGznZL2A3YH3ihEYOXojZX1XPXobE7cdwePHDazstFlIoiIq7r7YERM3/bhlLf/uOdlqiW+fcLkUodiZpa3vDuLJZ0k6SlJ0yVdUMigytETs1dw34tL+eJRuzJ2WP9Sh2Nmlrfu+gje1WHTp4FDgQOALxQyqHLT0trGD+5+iQkjBnDu+3YpdThmZpuluz6CC5QMhb00IpYCC4AfAW3A4mIEVy7+39QFvLJ0HVefcQD9+1aXOhwzs83SXR/B59Nawa8kTQW+A7wbGAhcVqT4er0Nza38/KHXmLLzCI8ZMLOy1G0fQUQ8HxEnA9OBO4EdIuLOiGgsSnRl4IYn5/NmbSMXH7un5xIys7LUXR/B+ZKeS8cSDAKOA0ZIul/S+4oWYS9W39TCNY+9znt2G8Whu4wqdThmZlukuxrBBRGxP0kH8b9HREtE/Bw4DfhIUaLr5W6btpAV65v416P3KHUoZmZbrLvO4kWSLiMZVfxK+8aIWA1cVOjAervWtuC3j89l/52Gc9DEkaUOx8xsi3WXCE4GjgWagQeLE075ePClN5m/sp6vH/eOUodiZrZVumsaGhcRd0XEfRHR2nGnEhO6O7mk4yTNkjRb0jc62T9M0l2Snpf0oqTPbkEZSuI3/5jDhBEDOMZTSZhZmeuuRvATSVXAHcA0YDnQH9gNOAr4APBdYGFnH5ZUDVwFfDA95hlJd0bESzmHfRF4KSJOkjQamCXpxoho2spyFdT0BWuYOn81l544mT7VnsnbzMpbd+MIPiFpMnAG8DlgB6AeeBm4B/hRRGzo5twHA7MjYg6ApFtImptyE0EAQ9KBa4OBVUDLlhenOG54cj6Daqr55EE7ljoUM7Ot1u3zCNJf79/awnOPJxmN3G4hcEiHY35BMj5hMTAEODUi2jqeSNJ5wHkAO+1U2pmv1zY0c/eMxXxk/wkM7ucHvJlZ+Stku0Zno6uiw/qxJIPVxgH7Ab+QNHSTD0VcGxFTImLK6NGjt32km+H25xaxobmNMw7xoxjMrDIUMhEsBHLbTiaw6RxFnwX+HInZwFyg196GExHc/PQb7DN+GHuPH1bqcMzMtolCJoJngN0lTZJUQzIQ7c4Ox7xB0umMpDHAnsCcAsa0VV5aUssrS9e5b8DMKkqPiSC9TfRMSZem6ztJOrinz0VEC3AhcD9JB/OtEfFiOnXF+elhlwHvlvQC8BDw9YhYsaWFKbTbn1tE32px4j47lDoUM7NtJp/ezl+STD39fuAHwDrgT8BBPX0wIu4hucMod9s1OcuLgWM2I96SaW0L7pi+mCP33J4Rg2pKHY6Z2TaTTyI4JCIOkPQcJFNMpE09mfLP11eybF0jp+w3vtShmJltU/n0ETSng8MCIB34tcktnpXu9umLGNKvDx/Ya/tSh2Jmtk3lkwh+DvwF2F7Sj4DHgf8oaFS9TENTK/fNXMrx+4z1E8jMrOL02DQUETdKmkZyd4+AUyLi5YJH1ov87eU3Wd/Ywin7u1nIzCpPj4lA0qHAixFxVbo+RNIhEfFUwaPrJe58fjFjh/bn0El++IyZVZ58moauBtbnrNel2zKhsaWVx19bwdGTt6eqyo+iNLPKk08iUES8NTVEOhdQZibZeXruKhqaW3n/O9xJbGaVKZ9EMEfSlyX1TV9foReP/t3WHn5lGf36VHHYLtuVOhQzs4LIJxGcD7wbWMTbM4ieV8igepNHZy3nsF1HMaDGdwuZWWXK566hZSTzBGXO3BV1zF1Rx9nvnljqUMzMCiafu4ZGA/8CTMw9PiI+V7iweofHX1sOwBF7lHbqazOzQsqn0/cO4B/A34BNnl1cyZ6cs4pxw/qz86iBpQ7FzKxg8kkEAyPi6wWPpJeJCJ6cs5Ij9hhN8iRNM7PKlE9n8d2SPlTwSHqZ2cvWs7KuiUN38SAyM6ts+SSCr5AkgwZJtZLWSaotdGCl9uSclQBOBGZW8fK5a2hIMQLp
bdr7B3YcOaDUoZiZFVReI4QljQB2B/q3b4uIvxcqqFKLCJ6au5L37e7+ATOrfPncPnouSfPQBGA6cCjwT5InllWkhasbWLG+iQN2HlHqUMzMCi7fPoKDgPkRcRSwP7C8oFGV2PML1wCw34ThJY7EzKzw8kkEGyJiA4CkfhHxCrBnYcMqrRkL11JTXcWeYzPZPWJmGZNPH8FCScOB24EHJa0GFhc2rNKavmANe40bSk2ffPKkmVl5y+euoY+ki9+T9AgwDLivoFGVUGtbMHPRWj5+4IRSh2JmVhRdJgJJQyOiVtLInM0vpO+DgVUFjaxE5q+so76plb3HDSt1KGZmRdFdjeAm4ERgGhAkzyvOfd+l4NGVwKyl6wDcP2BmmdFlIoiIE5XcRH9ERLxRxJhK6pWl65BgjzFOBGaWDd32hqaPqPxLkWLpFWYtXcfEUYP8IBozy4x8bot5UtJBBY+kl5j15jr2dG3AzDIkn0RwFPBPSa9LmiHpBUkzCh1YKTQ0tTJvZZ37B8wsU/IZR3B8waPoJeatrCMCdh8zuNShmJkVTT7jCOYDSNqenEnnKtH8lXUATBw1qMSRmJkVT49NQ5I+LOk1YC7wGDAPuLfAcZXEvJX1AOzkR1OaWYbk00dwGcmMo69GxCTgA8D/FTSqEpm/so5Rg2oY2r9vqUMxMyuafBJBc0SsBKokVUXEI8B+BY6rJOatqPeD6s0sc/LpLF4jaTDwd+BGScuAlsKGVRpvrKrnkEkjez7QzKyC5FMjOBloAP6NZLK514GT8jm5pOMkzZI0W9I3ujjmSEnTJb0o6bF8A9/WNjS3snhtg/sHzCxzupt07hfATRHxRM7m6/M9saRq4Crgg8BC4BlJd0bESznHDAd+CRwXEW+kdyaVxMLV9UT4jiEzy57uagSvAVdImifpckmb2y9wMDA7IuZERBNwC0ntItengD+3z2UUEcs28zu2mXkrkjuG3EdgZlnTZSKIiP+OiMOAI0imnP6dpJclXSppjzzOPR5YkLO+MN2Waw9ghKRHJU2TdFZnJ5J0nqSpkqYuX16Yp2TO8xgCM8uoHvsIImJ+RFweEfuT/IL/CPByHudWZ6frsN4HOBA4ATgW+E5nSSYiro2IKRExZfTo0Xl89eabv7Keof37MHygbx01s2zJZ0BZX0knSbqRZCDZq8DH8jj3QmDHnPUJbPqIy4XAfRFRFxErSO5MeldekW9j81fVs/OoQSQzb5uZZUeXiUDSByVdR3KxPg+4B9g1Ik6NiNvzOPczwO6SJkmqAU4D7uxwzB3A+yT1kTQQOIT8ahvb3KLV9UwYMaAUX21mVlLdjSP4JslTyi6OiM1+LGVEtEi6ELgfqAaui4gXJZ2f7r8mIl6WdB8wA2gDfhMRMze7FFspIli0poEj9yzZTUtmZiXT3RPKjtrak0fEPSQ1idxt13RY/wnwk639rq2xur6ZDc1tjB/uGoGZZU8+A8oq3uI1DQCMcyIwswxyIgAWrk4SgWsEZpZFTgS8XSMY785iM8ug7qaYWMem9/2/JSKGFiSiEli8poH+fasY4TEEZpZB3XUWDwGQ9ANgKfBHkkFiZwAV9VDfRWsaGDd8gMcQmFkm5dM0dGxE/DIi1kVEbURcTX4DysrG4jUN7h8ws8zKJxG0SjpDUrWkKklnAK2FDqyYltZuYOzQin4cs5lZl/JJBJ8CPgm8mb4+kW6rCK1twfJ1jYxxIjCzjOrxCWURMY9Np4+uGCvrGmkLGDO0X6lDMTMriXwmndtD0kOSZqbr+0r6duFDK45ltY0AbO8agZllVD5NQ78GLgGaASJiBskEchXhzdoNAGw/xDUCM8umfBLBwIh4usO2inl4/bJ1SY3AfQRmllX5JIIVknYlHVwm6ePAkoJGVUTtNYLRrhGYWUb12FkMfBG4FniHpEXAXODMgkZVRG/WNjJqUA19qz3bhpllUz53Dc0BjpY0CKiKiHWFD6t4lq/b4I5iM8u07uYaOjMi
bpB0UYftAETETwscW1G8WdvoW0fNLNO6qxEMSt8ral6hjt6s3cDkHSpm/jwzs83W3aRzv0rfv1+8cIqrtS1Ysb6R7V0jMLMM67GPQFJ/4BzgncBbjekR8bkCxlUUK9cno4rdR2BmWZbPrTJ/BMYCxwKPAROAiugwfjMdVTzGt46aWYblkwh2i4jvAHURcT1wArBPYcMqjmXr0lHFrhGYWYblkwia0/c1kvYGhgETCxZREb1VI3AfgZllWD4Dyq6VNAL4DnAnMBi4tKBRFcmK9UkiGDXIicDMsiufAWW/SRcfA3YpbDjFtaquiSH9+1DTx6OKzSy7uhtQdlFX+6AyBpStqmti1KCaUodhZlZS3dUIKnogGSSJYKQTgZllXHcDyip2IFm7lXVNjB/uO4bMLNvyeULZLpLukrRc0jJJd0iqiL6C1a4RmJnldfvoTcCtwA7AOOB/gZsLGVQxRASr6poY4URgZhmXTyJQRPwxIlrS1w2kD6kpZ3VNrTS1trmz2MwyL59xBI9I+gZwC0kCOBX4q6SRABGxqoDxFcyq9U0AjBjoRGBm2ZZPIjg1ff98h+2fI0kMZdlfsLIuHUw22InAzLItnwFlk4oRSLGtrk9qBCM9qtjMMi6fu4Yuk1Sdsz5U0u/yObmk4yTNkjQ7bV7q6riDJLVK+nh+YW+9lWnT0Eg3DZlZxuXTWdwHeFrSvpKOAZ4BpvX0oTR5XAUcD0wGTpc0uYvjLgfu35zAt9aqujQRuGnIzDIun6ahSyQ9BDwFrAYOj4jZeZz7YGB2RMwBkHQLcDLwUofjvgT8CThocwLfWqvqm6iprmJQTXXPB5uZVbB8moYOB/4b+AHwKPALSePyOPd4YEHO+sJ0W+65xwMfAa7pIYbzJE2VNHX58uV5fHXPVq1PBpNJ2ibnMzMrV/ncNfRfwCci4iUASR8FHgbe0cPnOrvCdhx/cCXw9Yho7e6CHBHXAtcCTJkyZZuMYVhd71HFZmaQXyI4LCJa21ci4s+SHsvjcwuBHXPWJwCLOxwzBbglTQLbAR+S1BIRt+dx/q2y0tNLmJkB3TQNSboSIP21/pUOu6/I49zPALtLmiSpBjiN5ME2b4mISRExMSImArcBFxQjCYBnHjUza9ddH8HhOcuf6bBv355OHBEtwIUkdwO9DNwaES9KOl/S+Zsd6TbmRGBmluiuaUhdLOctIu4B7umwrdOO4Yg4e0u+Y0s0tbSxbkOLE4GZGd0ngqr0WcVVOcvtCaGs77lc09A+z1DfEkdiZlZ63SWCYSQDx9ov/s/m7Cvr2UdrG1oAGDrAicDMrLsnlE0sYhxFVbuhGXAiMDOD/KaYqDhrG5JEMMyJwMwsm4mgNk0EQ/s7EZiZdTeOoCKnnwao3dDeR5DPeDozs8rWXY3gNoB0wrmK4hqBmdnberp99LvAHpIu6rgzIn5auLAKq3ZDM/36VNG/b1nfBWtmtk10VyM4DdhAkiyGdPIqW7UNzb5jyMws1d3to7OAyyXNiIh7ixhTwdU2tDC0v/sHzMwgv7uGnpD00/bnAUi6QtKwgkdWQLUbXCMwM2uXTyK4DlgHfDJ91QJ5PbO4t6ptaHZHsZlZKp/2kV0j4mM569+XNL1QARVD7YYWdh41qNRhmJn1CvnUCBokvbd9RdJ7gIbChVR4axuaPYbAzCyVz9XwfOAPOf0Cq9n0+QRlIyLcNGRmlqPHRBARzwPvkjQ0Xa8teFQF1NDcSktbuLPYzCyVd/tIuSeAdm9NQe0agZkZkMFJ596egtp9BGZmkMFEsL4xqREM6udEYGYGeTQNSaoGTgAm5h5frnMN1aWJYLATgZkZkF8fwV0kcw69ALQVNpzCa08Eg2qcCMzMIL9EMCEi9i14JEWyvrEVcI3AzKxdPn0E90o6puCRFMlbNYJ+noLazAzyqxE8CfxFUhXQDAiIiBha0MgKxJ3FZmYby+dqeAVwGPBCRESB4ym4usYW+lSJfn0yd8OUmVmn8rkavgbM
rIQkAEkiGNSvD5JKHYqZWa+QT41gCfCopHuBxvaN5Xr76PrGVncUm5nlyOeKODd91aSvspbUCNxRbGbWLp9J575fjECKpa6pxR3FZmY58hlZ/AiwSf9ARLy/IBEV2PrGFjcNmZnlyOeKeHHOcn/gY0BLYcIpvLrGFrYf0q/UYZiZ9Rr5NA1N67Dp/yQ9VqB4Cq6usdVNQ2ZmOfJpGhqZs1oFHAiMLVhEBeamITOzjeVzRZxG0kcgkiahucA5hQyqUCLirXEEZmaWyKdpaNKWnlzSccB/A9XAbyLixx32nwF8PV1dD3whfTRmQTS2tNHSFq4RmJnl6HJksaSDJI3NWT9L0h2Sft6huairz1cDVwHHA5OB0yVN7nDYXOCIdHbTy4Brt6QQ+Xp7CmqPIzAza9fdFBO/ApoAJB0O/Bj4A7CW/C7YBwOzI2JORDQBtwAn5x4QEU9ExOp09UlgwuaFv3nq0imo3TRkZva27hJBdUSsSpdPBa6NiD9FxHeA3fI493hgQc76wnRbV84B7u1sh6TzJE2VNHX58uV5fHXn1vvpZGZmm+g2EUhqv2J+AHg4Z18+V9LOZnXrdOI6SUeRJIKvd7Y/Iq6NiCkRMWX06NF5fHXn6po8BbWZWUfdXRFvBh6TtAJoAP4BIGk3kuahniwEdsxZnwAs7niQpH2B3wDHR8TKPOPeIvVN7U1D7iMwM2vXZSKIiB9JegjYAXggZxrqKuBLeZz7GWB3SZOARcBpwKdyD5C0E/Bn4NMR8eoWxL9Z6tOmoQF9XSMwM2vX7RUxIp7sZFteF+yIaJF0IXA/ye2j10XEi5LOT/dfA1wKjAJ+mT4foCUipmxeEfLXXiMY6LuGzMzeUtCfxhFxD3BPh23X5CyfC5xbyBhy1Tc7EZiZdZSp5zU2pJ3FA5wIzMzekqlE8HbTkPsIzMzaZSoRNDS1UtOniuoqP6/YzKxdphJBfVOr+wfMzDrIXiLo60RgZpYrU4mgobmFgR5VbGa2kUwlAjcNmZltKnOJYICbhszMNpKpRNDgGoGZ2SYylQjqmlo8hsDMrINMJYKGplaPKjYz6yBTicCdxWZmm8pUInCNwMxsU5lJBC2tbTS1tjHQzyIwM9tIZhKBp6A2M+tcZhJBQwRV/ooAAAitSURBVDrzqJuGzMw2lplE4KeTmZl1LkOJIHkojccRmJltLDOJoME1AjOzTmUmEbhpyMysc5lLBO4sNjPbWGYSweghNRy/91hGDepX6lDMzHqVzPScHrjzSA7ceWSpwzAz63UyUyMwM7POORGYmWWcE4GZWcY5EZiZZZwTgZlZxjkRmJllnBOBmVnGORGYmWWcIqLUMWwWScuB+Vv48e2AFdswnN6kUsvmcpWXSi0XlH/Zdo6I0Z3tKLtEsDUkTY2IKaWOoxAqtWwuV3mp1HJBZZfNTUNmZhnnRGBmlnFZSwTXljqAAqrUsrlc5aVSywUVXLZM9RGYmdmmslYjMDOzDpwIzMwyLjOJQNJxkmZJmi3pG6WOZ3NIuk7SMkkzc7aNlPSgpNfS9xE5+y5JyzlL0rGlibpnknaU9IiklyW9KOkr6fayLpuk/pKelvR8Wq7vp9vLulztJFVLek7S3el6pZRrnqQXJE2XNDXdVhFl61FEVPwLqAZeB3YBaoDngcmljmsz4j8cOACYmbPtP4FvpMvfAC5Plyen5esHTErLXV3qMnRRrh2AA9LlIcCrafxlXTZAwOB0uS/wFHBouZcrp3wXATcBd1fKv8U03nnAdh22VUTZenplpUZwMDA7IuZERBNwC3ByiWPKW0T8HVjVYfPJwPXp8vXAKTnbb4mIxoiYC8wmKX+vExFLIuLZdHkd8DIwnjIvWyTWp6t901dQ5uUCkDQBOAH4Tc7msi9XNyq5bG/JSiIYDyzIWV+YbitnYyJiCSQXVGD7dHtZllXSRGB/kl/PZV+2tPlkOrAMeDAiKqJcwJXA14C2nG2VUC5IkvUDkqZJOi/dVill61ZW
Hl6vTrZV6n2zZVdWSYOBPwH/GhG1UmdFSA7tZFuvLFtEtAL7SRoO/EXS3t0cXhblknQisCwipkk6Mp+PdLKt15Urx3siYrGk7YEHJb3SzbHlVrZuZaVGsBDYMWd9ArC4RLFsK29K2gEgfV+Wbi+rskrqS5IEboyIP6ebK6JsABGxBngUOI7yL9d7gA9LmkfSvPp+STdQ/uUCICIWp+/LgL+QNPVURNl6kpVE8Aywu6RJkmqA04A7SxzT1roT+Ey6/Bngjpztp0nqJ2kSsDvwdAni65GSn/6/BV6OiJ/m7CrrskkandYEkDQAOBp4hTIvV0RcEhETImIiyf9DD0fEmZR5uQAkDZI0pH0ZOAaYSQWULS+l7q0u1gv4EMldKa8D3yp1PJsZ+83AEqCZ5JfIOcAo4CHgtfR9ZM7x30rLOQs4vtTxd1Ou95JUp2cA09PXh8q9bMC+wHNpuWYCl6bby7pcHcp4JG/fNVT25SK5o/D59PVi+zWiEsqWz8tTTJiZZVxWmobMzKwLTgRmZhnnRGBmlnFOBGZmGedEYGaWcU4EVnCSQtIVOesXS/reNjr37yV9fFucq4fv+UQ6S+ojnezbQ9I96UyUL0u6VdKYQsdUSJJOkTS51HFYcTgRWDE0Ah+VtF2pA8klqXozDj8HuCAijupwjv7AX4GrI2K3iNgLuBoYve0iLYlTSGbYtAxwIrBiaCF53uu/ddzR8Re9pPXp+5GSHkt/Xb8q6ceSzkjn+X9B0q45pzla0j/S405MP18t6SeSnpE0Q9Lnc877iKSbgBc6ief09PwzJV2ebruUZPDbNZJ+0uEjnwL+GRF3tW+IiEciYqaS5xL8Lj3fc5KOSs93tqTbJd0laa6kCyVdlB7zpKSR6XGPSrpS0hNpPAen20emn5+RHr9vuv17Sp5d8aikOZK+nFOuM9O/3XRJv2pPgpLWS/qRkmcnPClpjKR3Ax8GfpIev6ukL0t6Kf3OW/L5j25lpNQj2vyq/BewHhhKMt/7MOBi4Hvpvt8DH889Nn0/ElhD8syCfsAi4Pvpvq8AV+Z8/j6SHzW7k4y87g+cB3w7PaYfMJVk3vgjgTpgUidxjgPeIPk13wd4GDgl3fcoMKWTz/wU+EoX5f4q8Lt0+R3pufsDZ5NMWzwk/a61wPnpcT8jmXyv/Tt/nS4fTvo8CuB/gO+my+8HpqfL3wOeSMu7HbCSZArsvYC7gL7pcb8EzkqXAzgpXf7PnL9Zx/8ui4F+6fLwUv+b8mvbvlwjsKKIiFrgD8CXezo2xzORPLOgkWQo/wPp9heAiTnH3RoRbRHxGjCH5KJ7DHCWkqmgnyKZKmD39PinI5lDvqODgEcjYnlEtAA3klyAt9R7gT8CRMQrwHxgj3TfIxGxLiKWkySC9hpFx7LdnH7+78DQdA6j3PM+DIySNCw9/q+RzJG/gmSCtDHAB4ADgWfSv8cHSKZUAGgC7k6Xp3X47lwzgBslnUlSw7MKkpVpqK13uBJ4FvhdzrYW0ibKdBK6mpx9jTnLbTnrbWz8b7fjPClBMk3wlyLi/twdSqZPrusivi7nv+7Gi8ARW3C+rS1bR+3H5Z63NT2XgOsj4pJOPtccEdHh+M6cQJIUPwx8R9I702RpFcA1AiuaiFgF3ErS8dpuHsmvVUie+tR3C079CUlVab/BLiSTgN0PfEHJNNftd/YM6uE8TwFHSNoubUM/HXish8/cBLxb0gntG5Q8H3sf4O/AGe3fD+yUxrY5Tk0//15gbUSs7XDeI4EVaY2rKw8BH1cyz357H8POPXzvOpKmKyRVATtGxCMkD6UZDgzezHJYL+YagRXbFcCFOeu/Bu6Q9DTJBaurX+vdmUVywR5D0ta+QdJvSJo5nk1rGst5+zGDnYqIJZIuAR4h+RV9T0Tc0cNnGtIO6islXUkyQ+wMkn6MX5J0ML9AUvM5OyIa1fWDdzqzWtITJH0sn0u3fQ/4naQZQD1vT5PcVYwvSfo2ydO3qtIYv0jS
VNWVW4Bfpx3OpwG/TZufBPwskucsWIXw7KNmvZSkR4GLI2JqqWOxyuamITOzjHONwMws41wjMDPLOCcCM7OMcyIwM8s4JwIzs4xzIjAzy7j/D6hnuWY2KfbNAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Exploratory step on the scaled data. The figure rendered by this call is what\n",
    "# the explained variance (and hence the number of components to keep) is read from.\n",
    "dim_red.dim_exploration(X_data_std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can see from the explained variance plot above that we need about 100 features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 104
    },
    "colab_type": "code",
    "id": "vpODgM1JkJLg",
    "outputId": "15675dc7-b711-40dd-a4b0-d5fffd590548"
   },
   "outputs": [],
   "source": [
    "# Do PCA only, keeping a fixed number of components.\n",
    "# The component count is named so it is easy to find and tune; 100 is the value\n",
    "# read off the explained-variance exploration above.\n",
    "N_PCA_COMPONENTS = 100\n",
    "X_pca = dim_red.dim_red_pca_only(pd.DataFrame(X_data_std), N_PCA_COMPONENTS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "EJnfSOxrlPmm",
    "outputId": "13faa6de-dd53-4fb9-d6a2-46286f02aed2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79073, 100)"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the reduced data should keep one row per sample and have 100\n",
    "# columns, matching the number of components requested from dim_red_pca_only.\n",
    "X_pca.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Additionally, a simple supervised model can be used to compare the out-of-sample accuracy obtained with other dimensionality reduction \n",
    "and feature extraction techniques. Next, we will take the reduced dataset, embed it, and train a deep model."
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Dim_reduction.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
